{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "导入所需的库"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "from sklearn import datasets, model_selection\n",
    "from sklearn.neighbors import KNeighborsClassifier\n",
    "from sklearn.metrics import classification_report\n",
    "\n",
    "mnist  = datasets.fetch_mldata('MNIST original')\n",
    "data,target = mnist.data, mnist.target"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "查看数据集"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "((70000, 784), (70000,))"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.shape,target.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "构建数据集"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "index = np.random.choice(len(target), 70000, replace=False)\n",
    "\n",
    "def mk_dataset(size):\n",
    "    train_img = [data[i] for i in index[:size]]\n",
    "    train_img = np.array(train_img)\n",
    "    train_target = [target[i] for i in index[:size]]\n",
    "    train_target = np.array(train_target)\n",
    "    return train_img, train_target"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "fifty_x, fifty_y = mk_dataset(50000)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "查看构建的数据集大小"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "((50000, 784), (50000,))"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "fifty_x.shape, fifty_y.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "((20000, 784), (20000,))"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "twenty_x, twenty_y = mk_dataset(20000)\n",
    "twenty_x.shape, twenty_y.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "构建一个大小为10000的测试集"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "((10000, 784), (10000,))"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "test_img = [data[i] for i in index[60000:70000]]\n",
    "test_img1 = np.array(test_img)\n",
    "test_target = [target[i] for i in index[60000:70000]]\n",
    "test_target1 = np.array(test_target)\n",
    "test_img1.shape, test_target1.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "直接调用KNN模型，并不涉及模型的内在设计"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def skl_knn(k, test_data, test_target, stored_data, stored_target):\n",
    "    classifier = KNeighborsClassifier(n_neighbors=k)\n",
    "    classifier.fit(stored_data, stored_target)\n",
    "#     用stored_data，stored_target训练分类器\n",
    "    y_pred = classifier.predict(test_data)\n",
    "    print(classification_report(test_target, y_pred))    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "%%time\n",
    "skl_knn(5, test_img1, test_target1, fifty_x, fifty_y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "%%time\n",
    "skl_knn(5, test_img1, test_target1, twenty_x, twenty_y)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "使用余弦特征构建一个更快的分类器"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import heapq \n",
    "from collections import Counter\n",
    "from sklearn.metrics.pairwise import cosine_similarity\n",
    "from sklearn.metrics import classificatiom_report\n",
    "from sklearn import datasets, model_selection\n",
    "\n",
    "def cos_knn(k, test_data, test_target, stored_data, stored_target):\n",
    "    cosim = cosine_similarity(test_data, stored_data)\n",
    "    top = [(heapq.nlargest((k), range(len(i)), i.take)) for i in cosim]\n",
    "    top = [[stored_target[j] for j in [i:k]] for i in top]\n",
    "    pred = [max(set(i), key=i.count) for i in top]\n",
    "    pred = np.array(pred)\n",
    "    print(classification_report(test_target, pred))    "
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
